library(rvest)
library(writexl)
library(readxl)
library(dplyr)
library(lubridate)
library(haven)
library(effsize)

# roll call votes
# Keep only the roll calls in which at least one vote was coded 1.
#
# df: data frame with a `rollcall` column and a numeric `VOTE` column
#     (NA entries are ignored when summing).
# Returns the rows of `df` whose roll call has sum(VOTE) > 0; roll calls
# whose votes are all 0 or all NA are dropped.
myfilter <- function(df) {
  yea_totals <- df %>%
    group_by(rollcall) %>%
    summarise(total = sum(VOTE, na.rm = TRUE))
  # sum(..., na.rm = TRUE) never yields NA, so this comparison is NA-free.
  kept <- yea_totals$rollcall[yea_totals$total > 0]
  filter(df, rollcall %in% kept)
}

# Download and parse one House roll call from clerk.house.gov.
#
# year, i: year and roll call number used to build the file URL.
# Returns a data frame with one row per <legislator> node and the columns
# vote, name, party, state, url, year, rollcall and VOTE, or NULL when the
# page holds no <legislator> nodes. Errors (download failure, mismatched
# node counts) are left for the caller to handle.
.myroll_one <- function(year, i) {
  # File names carry the roll number zero-padded to three digits.
  url <- paste0(
    "http://clerk.house.gov/evs/", year,
    "/roll", sprintf("%03d", as.integer(i)), ".xml"
  )
  page <- read_html(url)
  legislators <- html_nodes(page, "legislator")
  votes <- html_text(html_nodes(page, "vote"))

  if (length(legislators) == 0) {
    return(NULL)
  }
  # Refuse to pair names with votes when the counts disagree; data.frame()
  # would otherwise silently recycle the shorter vector in some cases.
  if (length(votes) != length(legislators)) {
    stop(
      "found ", length(legislators), " legislators but ",
      length(votes), " votes",
      call. = FALSE
    )
  }

  # Read attributes from the parsed nodes rather than regex-matching the
  # serialized markup.
  x <- data.frame(
    vote = votes,
    name = html_attr(legislators, "sort-field"),
    party = html_attr(legislators, "party"),
    state = html_attr(legislators, "state"),
    url = url,
    year = year,
    rollcall = i,
    VOTE = NA_real_,
    stringsAsFactors = FALSE
  )
  # Numeric coding: 1 for Yea/Aye, 0 for Nay/No, NA for anything else.
  x$VOTE[x$vote %in% c("Yea", "Aye")] <- 1
  x$VOTE[x$vote %in% c("Nay", "No")] <- 0
  x
}

# Scrape a range of House roll calls for one year.
#
# year: year of the roll calls.
# min, max: first and last roll call number to fetch (inclusive).
# Returns one data frame with a row per legislator per roll call, passed
# through myfilter(). A roll call that cannot be downloaded or parsed is
# skipped with a warning instead of aborting the whole run; an error is
# raised only when nothing at all could be retrieved.
myroll <- function(year = 2015, min = 1, max = 705) {
  numbers <- min:max
  # Indexed by position so that min > 1 does not leave leading empty slots.
  rolls <- vector("list", length(numbers))
  for (k in seq_along(numbers)) {
    i <- numbers[k]
    parsed <- tryCatch(
      .myroll_one(year, i),
      error = function(e) {
        warning(
          "skipping roll call ", i, " of ", year, ": ", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    # Assigning NULL would delete the list slot, so store real results only.
    if (!is.null(parsed)) {
      rolls[[k]] <- parsed
    }
    print(i)
  }
  # bind_rows() ignores the NULL slots left by skipped roll calls.
  rolls <- bind_rows(rolls)
  if (nrow(rolls) == 0) {
    stop("no roll call data could be retrieved for ", year, call. = FALSE)
  }
  myfilter(rolls)
}

# Scrape each year separately; `max` is the last roll call number requested
# for that year and `min` is always the first one.
r2015 <- myroll(2015, min = 1, max = 705)
r2016 <- myroll(2016, min = 1, max = 622)
r2017 <- myroll(2017, min = 1, max = 710)
r2018 <- myroll(2018, min = 1, max = 500)

# Stack the four years into one table and cache it on disk.
rollcalls <- bind_rows(list(r2015, r2016, r2017, r2018))
saveRDS(rollcalls, file = "newrollcalls.RDS")

# Identify each roll call by its year followed by its roll call number.
rollcalls$id <- paste0(rollcalls$year, rollcalls$rollcall)

# One row per roll call; congress_absd holds |Cohen's d| of VOTE by party.
myd <- select(rollcalls, year, rollcall, id) %>%
  unique() %>%
  mutate(congress_absd = NA_real_)

# Split the usable votes once instead of re-filtering the full table for
# every roll call. Rows with a missing VOTE or party are dropped first.
party_votes <- na.omit(rollcalls[, c("id", "VOTE", "party")])
party_votes <- split(party_votes, party_votes$id)

myd$congress_absd <- vapply(
  myd$id,
  function(rc_id) {
    votes <- party_votes[[rc_id]]
    d <- cohen.d(d = votes$VOTE, f = votes$party)
    as.numeric(abs(d$estimate))
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Largest finite effect size observed within each year.
myd <- myd %>%
  group_by(year) %>%
  mutate(max = max(congress_absd[is.finite(congress_absd)], na.rm = TRUE)) %>%
  ungroup()

# Infinite estimates are capped at their year's largest finite value and
# NaN estimates are set to 0. is.infinite()/is.nan() are FALSE for NA, so a
# missing estimate stays NA and cannot break the subscripted assignment.
is_inf <- is.infinite(myd$congress_absd)
myd$congress_absd[is_inf] <- myd$max[is_inf]
myd$congress_absd[is.nan(myd$congress_absd)] <- 0

# Load the Comparative Agendas roll call file and keep the rows whose bill
# label, once digits, slashes and spaces are removed, is exactly "HR".
#   billnum: the bill label with all letters removed, as a number
#   type:    the bill label with digits, "/" and spaces removed
#   bill_id: "<cong>-<type>-<billnum>"
pap_url <- paste0(
  "https://comparativeagendas.s3.amazonaws.com/datasetfiles/",
  "US-Legislative_roll_call_votes_19.4.csv"
)
pap <- read.csv(pap_url) %>%
  rename(rollcall = rc_count) %>%
  mutate(
    billnum = as.numeric(gsub("[a-z]", "", bill, ignore.case = TRUE)),
    type = gsub("/| ", "", gsub("[0-9]", "", bill))
  ) %>%
  filter(type == "HR") %>%
  mutate(bill_id = paste(cong, type, billnum, sep = "-"))

# Attach the bill id and major topic code to every roll call. The join keys
# are spelled out so the match does not depend on which column names the two
# tables happen to share.
myd.pap <- left_join(
  myd,
  pap[, c("rollcall", "year", "bill_id", "pap_majortopic")],
  by = c("year", "rollcall")
) %>%
  unique()

# Keep the selected major topic codes and fold codes 18 and 19 into 16.
myd.pap <- filter(myd.pap, pap_majortopic %in% c(1, 2, 8, 9, 13, 16, 18, 19))
myd.pap$pap_majortopic[myd.pap$pap_majortopic %in% c(16, 18, 19)] <- 16

# Mean effect size per year and topic. Both grouping columns are given in a
# single group_by() call, avoiding the deprecated `add` argument.
myd.pap <- myd.pap %>%
  group_by(year, pap_majortopic) %>%
  summarise(congress_absd = mean(congress_absd)) %>%
  ungroup()

write_dta(myd.pap, "newcongressd.dta", version = 13)
